### Load standard packages
library(tidyverse) # Collection of all the good stuff like dplyr, ggplot2 etc.
library(magrittr) # For extra-piping operators (eg. %<>%)
library(tidytext) # Tidy text mining helpers used below (unnest_tokens(), cast_dtm(), bind_tf_idf())

This session

Refresher:

Bag of words model

  • In order for a computer to understand text we need to somehow find a useful representation.
  • If you need to compare different texts e.g. articles, you will probably go for keywords. These keywords may come from a keyword-list with for example 200 different keywords
  • In that case you could represent each document with a (sparse) vector with 1 for “keyword present” and 0 for “keyword absent”
  • We can also get a bit more sophisticated and count the number of times a word from our dictionary occurs.
  • For a corpus of documents that would give us a document-term matrix.

example

Let’s try creating a bag of words model from our initial example.

# Toy corpus: six one-line documents, numbered 1 to 6.
text <- tibble(
  id = 1:6,
  text = c(
    'A text about cats.',
    'A text about dogs.',
    'And another text about a dog.',
    'Why always writing about cats and dogs, always dogs?',
    'There are too little text about cats but to many about dogs',
    'Cats, cats, cats! I love cats soo much. Cats are way better than dogs'
  )
)

# Tidy token table: one row per (document, word) pair, with the count in `n`.
text_tidy <- count(
  unnest_tokens(text, word, text, token = 'words'),
  id, word
)

The document-term matrix (DTM)

  • The simplest form of vector representation of text is a document-term matrix
  • How do we get a document-term matrix now?
  • We could do it by hand, with well-known dplyr syntax (Note: only works when you have one row per unique document-word pair)
# Spread the words into columns; (document, word) pairs that never occur get 0.
pivot_wider(text_tidy, names_from = word, values_from = n, values_fill = 0)
  • We could also use cast_dtm() to create a DTM in the format of the tm package.
# Cast the tidy counts into a DocumentTermMatrix: documents = id, terms = word,
# cell values = n.
text_dtm <- cast_dtm(text_tidy, document = id, term = word, value = n)
text_dtm
<<DocumentTermMatrix (documents: 6, terms: 25)>>
Non-/sparse entries: 42/108
Sparsity           : 72%
Maximal term length: 7
Weighting          : term frequency (tf)
  • We can simply convert it to a tibble. Since there exists no direct transfer function, we have to first transform it to a matrix.
  • Notice how we recover the rownames
# Go through a dense matrix first; the matrix row names come back as column `id`.
as_tibble(as.matrix(text_dtm), rownames = 'id')
  • Sidenote: We can also tidy the DTM again to a tidy token-dataframe.
# Back from the DTM to a tidy token data frame.
tidy(text_dtm)
  • We also can directly use a similar function to cast a sparse matrix (which we for sure then also could transform to a tibble again)
# Sparse matrix with documents (id) as rows, words as columns and counts (n) as values.
cast_sparse(text_tidy, id, word, n)
6 x 25 sparse Matrix of class "dgCMatrix"
                                                   
1 1 1 1 1 . . . . . . . . . . . . . . . . . . . . .
2 1 1 . 1 1 . . . . . . . . . . . . . . . . . . . .
3 1 1 . 1 . 1 1 1 . . . . . . . . . . . . . . . . .
4 . 1 1 . 2 1 . . 2 1 1 . . . . . . . . . . . . . .
5 . 2 1 1 1 . . . . . . 1 1 1 1 1 1 1 . . . . . . .
6 . . 5 . 1 . . . . . . 1 . . . . . . 1 1 1 1 1 1 1
  • Finally, we could just apply a text recipe here
library(recipes)
library(textrecipes)

# Bag of words as a recipe: split `text` into word tokens, then expand the
# tokens into term-frequency columns (step_tf(), i.e. not TF-IDF weighted).
recipe(text, ~.) %>%
  step_tokenize(text, token = 'words') %>%
  step_tf(text) %>%
  prep() %>%
  juice()

TF-IDF - Term Frequency - Inverse Document Frequency

  • A token is important for a document if it appears very often
  • A token becomes less important for comparison across a corpus if it appears all over the place in the corpus
  • Cat in a corpus of websites talking about cats is not that important

\[w_{i,j} = tf_{i,j}*log(\frac{N}{df_i})\]

  • \(w_{i,j}\) = the TF-IDF score for a term i in a document j
  • \(tf_{i,j}\) = number of occurrences of term i in document j
  • \(N\) = number of documents in the corpus
  • \(df_i\) = number of documents with term i
# Append the TF-IDF weights to the tidy token table (words are the terms,
# ids are the documents, n holds the counts).
text_tidy <- bind_tf_idf(text_tidy, term = word, document = id, n = n)
  • We obviously could also cast a tf_idf weighted dtm…
# Wide document-term table weighted by tf_idf; absent (document, word) pairs get 0.
pivot_wider(
  select(text_tidy, id, word, tf_idf),
  names_from = word,
  values_from = tf_idf,
  values_fill = 0
)
  • btw: this is equivalent to just running a textrecipe like that:
# The recipe route: word tokens first, then TF-IDF weighted term columns.
recipe(text, ~.) %>%
  step_tokenize(text, token = 'words') %>%
  step_tfidf(text) %>%
  prep() %>%
  juice()
  • A last reminder on the powerful pairwise_xx() functions from the widyr package
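  • For instance, pairwise_dist() computes the distance between every pair of documents from their TF-IDF vectors, which we can then rescale into a similarity score.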
library(widyr)

# Manhattan distance between the tf_idf vectors of every pair of documents,
# rescaled by the largest distance so that the most distant pair scores 0.
pairwise_dist(text_tidy, id, word, tf_idf, method = "manhattan") %>%
  mutate(similarity = 1 - distance / max(distance)) %>%
  select(-distance) %>%
  arrange(desc(similarity))

Dimensionality reduction techniques

# Wipes every object from the global environment, including `text`, `text_tidy`
# and `text_dtm` from the toy example: anything used below must be re-created.
# NOTE(review): rm(list = ls()) is discouraged in shared scripts; restarting the
# R session is the safer way to get a clean slate.
rm(list=ls())
  • OK, let's first get some more interesting data.
# Load the larger corpus (CORDIS H2020 reports, going by the file name). Without
# this import `text` does not exist at this point, because the workspace has
# just been cleared, and the rename/filter below fails with
# "object 'text' not found".
text <- read_csv('https://github.com/SDS-AAU/SDS-master/raw/master/M2/data/cordis-h2020reports.gz')

# Use column `X1` as the document id and keep English reports only.
# NOTE(review): `X1` is presumably readr's auto-generated name for an unnamed
# first column; readr >= 2.0 names such a column `...1` instead. Confirm
# against the installed version.
text %<>%
  rename(id = X1) %>%
  filter(language == 'en')
# preprocessing
# Build the tidy token table from `text` rather than updating `text_tidy` in
# place with %<>%: the earlier `text_tidy` was deleted by rm(list = ls()), so
# the in-place update fails with "object 'text_tidy' not found".
# NOTE(review): assumes `text` has a character column named `text`. Confirm.
text_tidy <- text %>%
  unnest_tokens(word, text, token = 'words') %>%
  #mutate(word = word %>% str_remove_all('[^[:alnum:]]')) %>% ## remove all special characters
  filter(str_length(word) > 2) %>% # drop words with fewer than 3 characters
  group_by(word) %>%
  filter(n() > 100) %>% # keep only words occurring more than 100 times
  ungroup() %>%
  anti_join(stop_words, by = 'word') # remove stopwords

PCA

# Wide document-term count table: one row per document (`id`), one column per
# word. column_to_rownames('id') needs a data frame with an `id` column; the
# `text_dtm` created earlier with cast_dtm() is a DocumentTermMatrix (and is
# removed by rm(list = ls())), so it cannot be used here directly.
text_dtm <- text %>%
  unnest_tokens(word, text, token = 'words') %>%
  count(id, word) %>%
  pivot_wider(names_from = word, values_from = n, values_fill = 0)

# PCA on the centred and scaled word counts. `id` is moved into the row names
# so that it is not treated as a feature.
text_pca <- text_dtm %>%
  column_to_rownames('id') %>%
  prcomp(center = TRUE, scale. = TRUE)
text_pca
Standard deviations (1, .., p=6):
[1] 3.207823e+00 2.759080e+00 2.234460e+00 1.388305e+00 4.208882e-01 6.333504e-16

Rotation (n x k) = (25 x 6):
                PC1         PC2           PC3         PC4         PC5          PC6
a       -0.12298966 -0.20293146  0.3096529650 -0.16145250 -0.10294565 -0.242719650
about   -0.27381282  0.16761993 -0.0523034736  0.01945303  0.03687218  0.621255802
cats     0.29282077  0.10623353 -0.0259909788  0.02013339  0.39484526 -0.139416189
text    -0.22594973  0.04505067  0.2965162521 -0.09825013 -0.09569899 -0.609033136
dogs     0.05438080  0.04321488 -0.4058754433 -0.06481897 -0.83922505 -0.056366491
and     -0.07300087 -0.21088577 -0.1718210259  0.48798584  0.03698842 -0.088662025
another -0.07475529 -0.14019613  0.1983802046  0.55517809 -0.11432042  0.019762232
dog     -0.07475529 -0.14019613  0.1983802046  0.55517809 -0.11432042  0.019762232
always  -0.01758432 -0.12655562 -0.4157185213  0.06208059  0.16110748 -0.177054662
why     -0.01758432 -0.12655562 -0.4157185213  0.06208059  0.16110748 -0.177054662
writing -0.01758432 -0.12655562 -0.4157185213  0.06208059  0.16110748 -0.177054662
are      0.14435186  0.31529231  0.0002177812  0.12216722 -0.01817629 -0.173880335
but     -0.12079837  0.32924621 -0.0403763595  0.09233359  0.01706538 -0.068982991
little  -0.12079837  0.32924621 -0.0403763595  0.09233359  0.01706538 -0.068982991
many    -0.12079837  0.32924621 -0.0403763595  0.09233359  0.01706538 -0.068982991
there   -0.12079837  0.32924621 -0.0403763595  0.09233359  0.01706538 -0.068982991
to      -0.12079837  0.32924621 -0.0403763595  0.09233359  0.01706538 -0.068982991
too     -0.12079837  0.32924621 -0.0403763595  0.09233359  0.01706538 -0.068982991
better   0.30339063  0.06957053  0.0406518334  0.06219708 -0.04005677  0.004133281
i        0.30339063  0.06957053  0.0406518334  0.06219708 -0.04005677  0.004133281
love     0.30339063  0.06957053  0.0406518334  0.06219708 -0.04005677  0.004133281
much     0.30339063  0.06957053  0.0406518334  0.06219708 -0.04005677  0.004133281
soo      0.30339063  0.06957053  0.0406518334  0.06219708 -0.04005677  0.004133281
than     0.30339063  0.06957053  0.0406518334  0.06219708 -0.04005677  0.004133281
way      0.30339063  0.06957053  0.0406518334  0.06219708 -0.04005677  0.004133281
# Coordinates of each document on the principal components.
text_pca$x
         PC1       PC2        PC3        PC4          PC5           PC6
1 -0.9053472 -1.026254  1.3687094 -1.4697533  0.659153358  4.284767e-16
2 -0.9903765 -1.025903  0.8434948 -1.5666736 -0.667757859  1.538700e-15
3 -1.5702074 -2.178507  2.0217993  2.1842199 -0.041338206 -2.376571e-16
4 -0.3693521 -1.966548 -4.2368110  0.2442417  0.058256383 -5.152129e-16
5 -2.5373253  5.116156 -0.4114972  0.3632652  0.006170831 -1.451964e-15
6  6.3726085  1.081056  0.4143047  0.2447001 -0.014484506 -7.667478e-16
  • Again, alternatively with a recipe…
# PCA as a recipe: keep `id` out of the predictors, tokenize into words, weight
# by TF-IDF, then reduce all predictors to 3 principal components.
text_pca <- recipe(text, ~.) %>%
  update_role(id, new_role = "id") %>%
  step_tokenize(text, token = 'words') %>%
  step_tfidf(text, prefix = NULL) %>%
  step_pca(all_predictors(), num_comp = 3) %>%
  prep()

juice(text_pca)
# Term loadings of the first three components, one panel per component.
# Step 3 of the recipe is the PCA step.
tidy(text_pca, 3) %>%
  filter(component %in% c("PC1", "PC2", "PC3")) %>%
  mutate(component = fct_inorder(component)) %>%
  ggplot(aes(x = value, y = terms, fill = terms)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(component), nrow = 1) +
  labs(y = NULL)
library(embed)

# UMAP embedding of the TF-IDF vectors: `id` is kept out of the predictors, the
# text is tokenized into words and weighted by TF-IDF, then projected with UMAP.
# step_umap() names its neighbourhood-size argument `neighbors`. The previous
# `n_neighbors = 2` did not set it, so the package default was used and uwot
# aborted with "n_neighbors must be smaller than the dataset size".
text_UMAP <- text %>%
  recipe(~.) %>%
  update_role(id, new_role = "id") %>%
  step_tokenize(text, token = 'words') %>% # tokenize
  step_tfidf(text, prefix = NULL) %>% # TFIDF weighting
  step_umap(all_predictors(), neighbors = 2) %>% # UMAP with 2 nearest neighbours
  prep()
Error in uwot(X = X, n_neighbors = n_neighbors, n_components = n_components,  : 
  n_neighbors must be smaller than the dataset size

Topic Models: LDA

#UMAP

Embeddings (Bonus)

# Pre-trained GloVe word vectors with 100 dimensions.
# embedding_glove42b() takes no `dimensions` argument (hence the "unused
# argument" error); the 6B variant does, and it matches the variable name.
# The function is namespace-qualified so the call does not depend on
# library(textdata) having been attached.
glove6b <- textdata::embedding_glove6b(dimensions = 100)
Error in embedding_glove42b(dimensions = 100) : 
  unused argument (dimensions = 100)

Summary

LS0tCnRpdGxlOiAnKFNvbWV3aGF0KSBhZHZhbmNlZCBOTFA6IHRleHQgdmVjdG9yaXphdGlvbicKYXV0aG9yOiAiRGFuaWVsIFMuIEhhaW4gKGRzaEBidXNpbmVzcy5hYXUuZGspIgpkYXRlOiAiVXBkYXRlZCBgciBmb3JtYXQoU3lzLnRpbWUoKSwgJyVCICVkLCAlWScpYCIKb3V0cHV0OgogIGh0bWxfbm90ZWJvb2s6CiAgICBjb2RlX2ZvbGRpbmc6IHNob3cKICAgIGRmX3ByaW50OiBwYWdlZAogICAgdG9jOiB0cnVlCiAgICB0b2NfZGVwdGg6IDIKICAgIHRvY19mbG9hdDoKICAgICAgY29sbGFwc2VkOiBmYWxzZQogICAgdGhlbWU6IGZsYXRseQotLS0KCmBgYHtyIHNldHVwLCBpbmNsdWRlPUZBTFNFfQojIyMgR2VuZXJpYyBwcmVhbWJsZQpybShsaXN0PWxzKCkpClN5cy5zZXRlbnYoTEFORyA9ICJlbiIpICMgRm9yIGVuZ2xpc2ggbGFuZ3VhZ2UKb3B0aW9ucyhzY2lwZW4gPSA1KSAjIFRvIGRlYWN0aXZhdGUgYW5ub3lpbmcgc2NpZW50aWZpYyBudW1iZXIgbm90YXRpb24KCiMjIyBLbml0ciBvcHRpb25zCmxpYnJhcnkoa25pdHIpICMgRm9yIGRpc3BsYXkgb2YgdGhlIG1hcmtkb3duCmtuaXRyOjpvcHRzX2NodW5rJHNldCh3YXJuaW5nPUZBTFNFLAogICAgICAgICAgICAgICAgICAgICBtZXNzYWdlPUZBTFNFLAogICAgICAgICAgICAgICAgICAgICBjb21tZW50PUZBTFNFLCAKICAgICAgICAgICAgICAgICAgICAgZmlnLmFsaWduPSJjZW50ZXIiCiAgICAgICAgICAgICAgICAgICAgICkKYGBgCgpgYGB7cn0KIyMjIExvYWQgc3RhbmRhcmRwYWNrYWdlcwpsaWJyYXJ5KHRpZHl2ZXJzZSkgIyBDb2xsZWN0aW9uIG9mIGFsbCB0aGUgZ29vZCBzdHVmZiBsaWtlIGRwbHlyLCBnZ3Bsb3QyIGVjdC4KbGlicmFyeShtYWdyaXR0cikgIyBGb3IgZXh0cmEtcGlwaW5nIG9wZXJhdG9ycyAoZWcuICU8PiUpCmBgYAoKYGBge3J9CmxpYnJhcnkodGlkeXRleHQpCmBgYAoKIyBUaGlzIHNlc3Npb24KCgojIFJlZnJlc2hlcjoKCiFbXShodHRwczovL3Nkcy1hYXUuZ2l0aHViLmlvL1NEUy1tYXN0ZXIvMDBfbWVkaWEvbmxwX3RpZHl3b3JrZmxvdy5wbmcpCgoKIyBCYWcgb2Ygd29yZHMgbW9kZWwKCiogSW4gb3JkZXIgZm9yIGEgY29tcHV0ZXIgdG8gdW5kZXJzdGFuZCB0ZXh0IHdlIG5lZWQgdG8gc29tZWhvdyBmaW5kIGEgdXNlZnVsIHJlcHJlc2VudGF0aW9uLgoqIElmIHlvdSBuZWVkIHRvIGNvbXBhcmUgZGlmZmVyZW50IHRleHRzIGUuZy4gYXJ0aWNsZXMsIHlvdSB3aWxsIHByb2JhYmx5IGdvIGZvciBrZXl3b3Jkcy4gVGhlc2Uga2V5d29yZHMgbWF5IGNvbWUgZnJvbSBhIGtleXdvcmQtbGlzdCB3aXRoIGZvciBleGFtcGxlIDIwMCBkaWZmZXJlbnQga2V5d29yZHMKKiBJbiB0aGF0IGNhc2UgeW91IGNvdWxkIHJlcHJlc2VudCBlYWNoIGRvY3VtZW50IHdpdGggYSAoc3BhcnNlKSB2ZWN0b3Igd2l0aCAxIGZvciAia2V5d29yZCBwcmVzZW50IiBhbmQgMCBmb3IgImtleXdvcmQgYWJzZW50IgoqIFdlIGNhbiBhbHNvIGdldCBhIGJpdCBtb3JlIHNv
cGhvaXN0b2NhdGVkIGFuZCBjb3VudCB0aGUgbnVtYmVyIG9mIHRpbWVzIGEgd29yZCBmcm9tIG91ciBkaWN0aW9uYXJ5IG9jY3Vycy4KKiBGb3IgYSBjb3JwdXMgb2YgZG9jdW1lbnRzIHRoYXQgd291bGQgZ2l2ZSB1cyBhIGRvY3VtZW50LXRlcm0gbWF0cml4LgoKIVtleGFtcGxlXShodHRwczovL2kuc3RhY2suaW1ndXIuY29tL0MxVU1zLnBuZykKCkxldCdzIHRyeSBjcmVhdGluZyBhIGJhZyBvZiB3b3JkcyBtb2RlbCBmcm9tIG91ciBpbml0aWFsIGV4YW1wbGUuCgpgYGB7cn0KdGV4dCA8LSB0aWJibGUoaWQgPSBjKDE6NiksCiAgICAgICAgICAgICAgIHRleHQgPSBjKCdBIHRleHQgYWJvdXQgY2F0cy4nLAogICAgICAgICAgICAgICAgICAgICAgICAnQSB0ZXh0IGFib3V0IGRvZ3MuJywKICAgICAgICAgICAgICAgICAgICAgICAgJ0FuZCBhbm90aGVyIHRleHQgYWJvdXQgYSBkb2cuJywKICAgICAgICAgICAgICAgICAgICAgICAgJ1doeSBhbHdheXMgd3JpdGluZyBhYm91dCBjYXRzIGFuZCBkb2dzLCBhbHdheXMgZG9ncz8nLAogICAgICAgICAgICAgICAgICAgICAgICAnVGhlcmUgYXJlIHRvbyBsaXR0bGUgdGV4dCBhYm91dCBjYXRzIGJ1dCB0byBtYW55IGFib3V0IGRvZ3MnLAogICAgICAgICAgICAgICAgICAgICAgICAnQ2F0cywgY2F0cywgY2F0cyEgSSBsb3ZlIGNhdHMgc29vIG11Y2guIENhdHMgYXJlIHdheSBiZXR0ZXIgdGhhbiBkb2dzJykpCmBgYAoKYGBge3J9CnRleHRfdGlkeSA8LSB0ZXh0ICU+JSAKICB1bm5lc3RfdG9rZW5zKHdvcmQsIHRleHQsIHRva2VuID0gJ3dvcmRzJykgJT4lIAogIGNvdW50KGlkLCB3b3JkKQpgYGAKCgojIyBUaGUgZG9jdW1lbnQtdGVybSBtYXRyaXggKERUTSkKCiogVGhlIHNpbXBsZXN0IGZvcm0gb2YgdmVjdG9yIHJlcHJlc2VudGF0aW9uIG9mIHRleHQgaXMgYSBkZG9jdW1lbnQtdGVybSBtYXRyaXgKKiBIb3cgdG8gd2UgZ2V0IGEgZG9jdW1lbnQtdGVybSBtYXRyaXggbm93PwoqIFdlIGNvdWxkIGRvIGl0IGJ5IGhhbmQsIHdpdGggd2VsbC1rbm93biBgZHBseXJgIHN5bnRheCAoTm90ZTogb25seSB3b3JrcyB3aGVuIHlvdSBoYXZlIG9uZSByb3cgcGVyIHVuaXF1ZSBkb2N1bWVudC13b3JkIHBhaXIpCgpgYGB7cn0KdGV4dF90aWR5ICU+JQogIHBpdm90X3dpZGVyKG5hbWVzX2Zyb20gPSB3b3JkLCB2YWx1ZXNfZnJvbSA9IG4sIHZhbHVlc19maWxsID0gMCkKYGBgCgoqIFdlIGNvdWxkIGFsc28gdXNlIGBjYXN0X2R0bSgpYCB0byBjcmVhdGUgYSBEVE0gaW4gdGhlIGZvcm1hdCBvZiB0aGUgYHRtYCBwYWNrYWdlLgoKYGBge3J9CnRleHRfZHRtIDwtIHRleHRfdGlkeSAlPiUKICBjYXN0X2R0bShpZCwgd29yZCwgbikKYGBgCgpgYGB7cn0KdGV4dF9kdG0gCmBgYAoKKiBXZSBjYW4gc2ltcGx5IGNvbnZlcnQgaWcgdG8gYSB0aWJibGUuIFNpbmNlIHRoZXJlIGV4aXN0cyBubyBkaXJlY3QgdHJhbnNmZXIgZnVuY3Rpb24sIHdlIGhhdmUgdG8gZmlyc3QgdHJhbnNmb3JtIGl0IHRvIGEgbWF0
cml4LgoqIE5vdGljZSBob3cgd2UgcmVjb3ZlciB0aGUgcm93bmFtZXMKCmBgYHtyfQp0ZXh0X2R0bSAlPiUgYXMubWF0cml4KCkgJT4lIGFzX3RpYmJsZShyb3duYW1lcyA9ICdpZCcpIApgYGAKCiogU2lkZW5vdGU6IFdlIGNhbiBhbHNvIHRpZHkgdGhlIERUTSBhZ2FpbiB0byBhIHRpZHkgdG9rZW4tZGF0YWZyYW1lLgoKYGBge3J9CnRleHRfZHRtICU+JSB0aWR5KCkKYGBgCiogV2UgYWxzbyBjYW4gZGlyZWN0bHkgdXNlIGEgc2ltaWxhciBmdW5jdGlvbiB0byBjYXN0IGEgc3BhcnNlIG1hdHJpeCAod2hpY2ggd2UgZm9yIHN1cmUgdGhlbiBhbHNvIGNvdWxkIHRyYW5zZm9ybSB0byBhIHRpYmJsZSBhZ2FpbikKCmBgYHtyfQp0ZXh0X3RpZHkgJT4lIGNhc3Rfc3BhcnNlKHJvdyA9IGlkLCBjb2x1bW4gPSB3b3JkLCB2YWx1ZSA9IG4pCmBgYAoKKiBGaW5hbGx5LCB3ZSBjb3VsZCBqdXN0IGFwcGx5IGEgdGV4dCByZWNpcGUgaGVyZQoKYGBge3J9CmxpYnJhcnkocmVjaXBlcykKbGlicmFyeSh0ZXh0cmVjaXBlcykKYGBgCgpgYGB7cn0KdGV4dCAlPiUKICByZWNpcGUofi4pICU+JSAKICBzdGVwX3Rva2VuaXplKHRleHQsIHRva2VuID0gJ3dvcmRzJykgJT4lICMgdG9rZW5pemUKICBzdGVwX3RmKHRleHQpICU+JSAjIFRGSURGIHdlaWdodGluZwogIHByZXAoKSAlPiUganVpY2UoKQpgYGAKCgojIyBURi1JREYgLSBUZXJtIEZyZXF1ZW5jeSAtIEludmVyc2UgRG9jdW1lbnQgRnJlcXVlbmN5CgoqIEEgdG9rZW4gaXMgaW1wb3J0YW50IGZvciBhIGRvY3VtZW50IGlmIGFwcGVhcnMgdmVyeSBvZnRlbgoqIEEgdG9rZW4gYmVjb21lcyBsZXNzIGltcG9ydGFudCBmb3IgY29tcGFyaXNvbiBhY3Jvc3MgYSBjb3JwdXMgaWYgaXQgYXBwZWFycyBhbGwgb3ZlciB0aGUgcGxhY2UgaW4gdGhlIGNvcnB1cwoqICpDYXQqIGluIGEgY29ycHVzIG9mIHdlYnNpdGVzIHRhbGtpbmcgYWJvdXQgY2F0cyBpcyBub3QgdGhhdCBpbXBvcnRhbnQKCiQkd197aSxqfSA9IHRmX3tpLGp9KmxvZyhcZnJhY3tOfXtkZl9pfSkkJAoKLSAkd197aSxqfSQgPSB0aGUgVEYtSURGIHNjb3JlIGZvciBhIHRlcm0gaSBpbiBhIGRvY3VtZW50IGoKLSAkdGZfe2ksan0kID0gbnVtYmVyIG9mIG9jY3VyZW5jZSBvZiB0ZXJtIGkgaW4gZG9jdW1lbnQgagotICROJCA9IG51bWJlciBvZiBkb2N1bWVudHMgaW4gdGhlIGNvcnB1cwotICRkZl9pJCA9IG51bWJlciBvZiBkb2N1bWVudHMgd2l0aCB0ZXJtIGkKCmBgYHtyfQojIFRGSURGIHdlaWdodHMKdGV4dF90aWR5ICU8PiUKICBiaW5kX3RmX2lkZih0ZXJtID0gd29yZCwKICAgICAgICAgICAgICBkb2N1bWVudCA9IGlkLAogICAgICAgICAgICAgIG4gPSBuKQpgYGAKCiogV2Ugb2J2aW91c2x5IGNvdWxkIGFsc28gY2FzdCBhIHRmX2lkZiB3ZWlnaHRlZCBkdG0uLi4KCmBgYHtyfQp0ZXh0X3RpZHkgJT4lCiAgc2VsZWN0KGlkLCB3b3JkLCB0Zl9pZGYpICU+JQogIHBpdm90X3dpZGVyKG5hbWVzX2Zyb20gPSB3b3JkLCB2YWx1ZXNfZnJv
bSA9IHRmX2lkZiwgdmFsdWVzX2ZpbGwgPSAwKQpgYGAKCiogYnR3OiB0aGlzIGlzIGVxdWl2YWxlbnQgdG8ganVzdCBydW5uaW5nIGEgdGV4dHJlY2lwZSBsaWtlIHRoYXQ6CgpgYGB7cn0KdGV4dCAlPiUKICByZWNpcGUofi4pICU+JSAKICBzdGVwX3Rva2VuaXplKHRleHQsIHRva2VuID0gJ3dvcmRzJykgJT4lICMgdG9rZW5pemUKICBzdGVwX3RmaWRmKHRleHQpICU+JSAjIFRGSURGIHdlaWdodGluZwogIHByZXAoKSAlPiUganVpY2UoKQpgYGAKCiogQSBsYXN0IHJlbWluZGVyIG9uIHRoZSBwb3dlcmZ1bCBgcGFpcndpc2VfeHgoKWAgZnVuY3Rpb25zIGZyb20gdGhlIGB3aWR5cmAgcGFja2FnZQoqIEZvciBpbnN0YW5jZSwgcGFpcgoKYGBge3J9CmxpYnJhcnkod2lkeXIpCmBgYAoKYGBge3J9CnRleHRfdGlkeSAlPiUgcGFpcndpc2VfZGlzdChpZCwgd29yZCwgdGZfaWRmLCBtZXRob2QgPSAibWFuaGF0dGFuIikgJT4lCiAgbXV0YXRlKHNpbWlsYXJpdHkgPSAxIC0gKGRpc3RhbmNlIC8gbWF4KGRpc3RhbmNlKSkgKSAlPiUKICBzZWxlY3QoLWRpc3RhbmNlKSAlPiUKICBhcnJhbmdlKGRlc2Moc2ltaWxhcml0eSkpCmBgYAoKCgojIERpbWVuc2lvbmFsaXR5IHJlZHVjdGlvbiB0ZWNobmlxdWVzCgpgYGB7cn0Kcm0obGlzdD1scygpKQpgYGAKCiogT2ssIGxldHMgZ2V0IGZpcnN0IHNvbWUgbW9yZSBpbnRlcmVzdGluZyAKCmBgYHtyfQp0ZXh0IDwtIHJlYWRfY3N2KCdodHRwczovL2dpdGh1Yi5jb20vU0RTLUFBVS9TRFMtbWFzdGVyL3Jhdy9tYXN0ZXIvTTIvZGF0YS9jb3JkaXMtaDIwMjByZXBvcnRzLmd6JykKYGBgCgpgYGB7cn0KdGV4dCAlPD4lCiAgcmVuYW1lKGlkID0gWDEpICU+JQogIGZpbHRlcihsYW5ndWFnZSA9PSAnZW4nKQpgYGAKCgoKYGBge3J9CiMgcHJlcHJvY2Vzc2luZwp0ZXh0X3RpZHkgJTw+JQogICNtdXRhdGUod29yZCA9IHdvcmQgJT4lIHN0cl9yZW1vdmVfYWxsKCdbXls6YWxudW06XV0nKSkgJT4lICMjIHJlbW92ZSBhbGwgc3BlY2lhbCBjaGFyYWN0ZXJzCiAgZmlsdGVyKHN0cl9sZW5ndGgod29yZCkgPiAyICkgJT4lICMgUmVtb3ZlIHdvcmRzIHdpdGggbGVzcyB0aGFuICAzIGNoYXJhY3RlcnMKICBncm91cF9ieSh3b3JkKSAlPiUKICBmaWx0ZXIobigpID4gMTAwKSAlPiUgIyByZW1vdmUgd29yZHMgb2NjdXJpbmcgbGVzcyB0aGFuIDEwMCB0aW1lcwogIHVuZ3JvdXAoKSAlPiUKICBhbnRpX2pvaW4oc3RvcF93b3JkcywgYnkgPSAnd29yZCcpICMgcmVtb3ZlIHN0b3B3b3JkcwpgYGAKCgoKCgoKYGBge3IsIGluY2x1ZGU9RkFMU0V9CnRleHRfZHRtIDwtIHRleHQgJT4lCiAgdW5uZXN0X3Rva2Vucyh3b3JkLCB0ZXh0LCB0b2tlbiA9ICd3b3JkcycpICU+JSAKICBjb3VudChpZCwgd29yZCkgJT4lCiAgcGl2b3Rfd2lkZXIobmFtZXNfZnJvbSA9IHdvcmQsIHZhbHVlc19mcm9tID0gbiwgdmFsdWVzX2ZpbGwgPSAwKQpgYGAKCgoKIyMgUENBCgpgYGB7cn0KdGV4dF9wY2EgPC0gdGV4dF9kdG0gJT4lIAogIGNv
bHVtbl90b19yb3duYW1lcygnaWQnKSAlPiUgCiAgcHJjb21wKGNlbnRlciA9IFRSVUUsIHNjYWxlLiA9IFRSVUUpCmBgYAoKYGBge3J9CnRleHRfcGNhICU+JSBnbGltcHNlKCkKYGBgCgpgYGB7cn0KdGV4dF9wY2FbWyd4J11dCmBgYAoKKiBBZ2FpbiwgYWx0ZXJuYXRpdmVseSB3aXRoIGEgcmVjaXBlLi4uCgpgYGB7cn0KdGV4dF9wY2EgPC0gdGV4dCAlPiUKICByZWNpcGUofi4pICU+JSAKICB1cGRhdGVfcm9sZShpZCwgbmV3X3JvbGUgPSAiaWQiKSAlPiUKICBzdGVwX3Rva2VuaXplKHRleHQsIHRva2VuID0gJ3dvcmRzJykgJT4lICMgdG9rZW5pemUKICBzdGVwX3RmaWRmKHRleHQsIHByZWZpeCA9IE5VTEwpICU+JSAjIFRGSURGIHdlaWdodGluZwogIHN0ZXBfcGNhKGFsbF9wcmVkaWN0b3JzKCksIG51bV9jb21wID0gMykgJT4lICMgUENBCiAgcHJlcCgpIApgYGAKCmBgYHtyfQp0ZXh0X3BjYSAlPiUganVpY2UoKQpgYGAKCmBgYHtyfQp0ZXh0X3BjYSAlPiUKICB0aWR5KDMpICU+JQogIGZpbHRlcihjb21wb25lbnQgJWluJSBwYXN0ZTAoIlBDIiwgMTozKSkgJT4lCiAgbXV0YXRlKGNvbXBvbmVudCA9IGZjdF9pbm9yZGVyKGNvbXBvbmVudCkpICU+JQogIGdncGxvdChhZXModmFsdWUsIHRlcm1zLCBmaWxsID0gdGVybXMpKSArCiAgZ2VvbV9jb2woc2hvdy5sZWdlbmQgPSBGQUxTRSkgKwogIGZhY2V0X3dyYXAofmNvbXBvbmVudCwgbnJvdyA9IDEpICsKICBsYWJzKHkgPSBOVUxMKQpgYGAKCgpgYGB7cn0KbGlicmFyeShlbWJlZCkKYGBgCgpgYGB7cn0KdGV4dF9VTUFQIDwtIHRleHQgJT4lCiAgcmVjaXBlKH4uKSAlPiUgCiAgdXBkYXRlX3JvbGUoaWQsIG5ld19yb2xlID0gImlkIikgJT4lCiAgc3RlcF90b2tlbml6ZSh0ZXh0LCB0b2tlbiA9ICd3b3JkcycpICU+JSAjIHRva2VuaXplCiAgc3RlcF90ZmlkZih0ZXh0LCBwcmVmaXggPSBOVUxMKSAlPiUgIyBURklERiB3ZWlnaHRpbmcKICBzdGVwX3VtYXAoYWxsX3ByZWRpY3RvcnMoKSwgbl9uZWlnaGJvcnMgPSAyKSAlPiUKICBwcmVwKCkgCmBgYAoKCgoKCgoKIyMgVG9waWMgTW9kZWxzOiBMREEKCgoKCmBgYHtyfQojVU1BUApgYGAKCgoKIyBFbWJlZGRpbmdzIChCb251cykKCmBgYHtyfQpsaWJyYXJ5KHRleHRkYXRhKQoKZ2xvdmU2YiA8LSBlbWJlZGRpbmdfZ2xvdmUyN2IoZGltZW5zaW9ucyA9IDEwMCkKZ2xvdmU2YgojIFRoZXNlIG1lYmVkZGluZ3MgY2FuIG5vdyBiZSBsb2FkZWQgd2l0aCBzdGVwX3dvcmRlbWJlZGRpbmdzCmBgYAoKCgoKIyBTdW1tYXJ5CgoKCgoKCg==